Projections, Decompositions and Manifold Learning


In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from Orange.data import Table

Principal component analysis (PCA)


In [5]:
from Orange.projection import PCA, SparsePCA, RandomizedPCA

data = Table('iris')

pca = PCA(n_components=2)
pca_model = pca(data)

transformed = pca_model(data)
# Percentage of variance explained for each component
print('Explained variance ratio (first two components): %s' % str(pca_model.explained_variance_ratio_))

target_names = data.domain.class_var.values
for c, i, target_name in zip('rgb', [0, 1, 2], target_names):
    plt.scatter(transformed.X[data.Y == i, 0], transformed.X[data.Y == i, 1], c=c, label=target_name)
plt.legend(scatterpoints=1, loc=4)
plt.title('PCA of Iris dataset');


Explained variance ratio (first two components): [ 0.92461621  0.05301557]
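
The projection applied above is a linear map: subtracting the fitted mean and projecting onto the component directions reproduces the coordinates the model returns. A quick check, assuming (as the reconstruction cell further below also does) that the Orange model exposes the underlying components_ and mean_ attributes:


In [ ]:
# Manually project the centered data onto the principal axes and compare
# with the coordinates produced by calling the fitted model on the data
manual = np.dot(data.X - pca_model.mean_, pca_model.components_.T)
print('Max difference from model output: %.2e' % np.abs(manual - transformed.X).max())
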

Unsupervised dimensionality reduction with PCA


In [3]:
data = Table('ionosphere')
n_attributes = len(data.domain.attributes)
print('Original attributes: %d' % n_attributes)
print('Samples: %d' % len(data))

pca = PCA(n_components=6)
pca_model = pca(data)

# Plot the PCA spectrum
# Percentage of variance explained by each of the selected components
plt.plot(pca_model.explained_variance_ratio_, linewidth=2)
plt.xlabel('Components');
plt.ylabel('Explained variance ratio');
plt.xticks(np.arange(pca_model.n_components))

# The explained variance ratios of all components sum to 1.0; with only the
# first six components kept, np.sum(pca_model.explained_variance_ratio_) < 1.0

# Reconstruction error
transformed = pca_model(data)
print('Reduced dimension: %d' % (len(transformed.domain.attributes)))

# Apply inverse PCA transform
# Transform the data back to the original space, i.e. compute an input
# whose PCA transform would be `transformed`
reconstructed = np.dot(transformed.X, pca_model.components_) + pca_model.mean_
rec_err = np.linalg.norm(data.X - reconstructed)
# Q: What happens when the number of components equals the number of attributes?
# (See the sketch after this cell's output.)
print('Frobenius distance: %5.2f' % rec_err)


Original attributes: 32
Samples: 351
Reduced dimension: 6
Frobenius distance: 32.49
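
To see the answer to the question in the comment above: the reconstruction error shrinks as more components are retained, and it drops to (numerically) zero once the number of components equals the number of attributes. A short loop over a few settings, reusing the ionosphere data and the inverse transform from the cell above:


In [ ]:
# Reconstruction error for an increasing number of retained components;
# with k == n_attributes the reconstruction is exact up to rounding error
for k in (2, 6, 16, n_attributes):
    model = PCA(n_components=k)(data)
    t = model(data)
    reconstructed = np.dot(t.X, model.components_) + model.mean_
    print('k = %2d  Frobenius distance: %8.4f' % (k, np.linalg.norm(data.X - reconstructed)))
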

PCA is useful when high-dimensional data are flat in some dimensions


In [4]:
# Inspired by http://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_3d.html
from mpl_toolkits.mplot3d import Axes3D

# Generate a flat, pancake-shaped point cloud: most of the variance lies in
# two directions (a, b), with only small noise (z) in the third
np.random.seed(4)
y = np.random.normal(scale=0.5, size=(30000))
x = np.random.normal(scale=0.5, size=(30000))
z = np.random.normal(scale=0.1, size=len(x))

a = x + y
b = 2 * y
c = a - b + z

norm = np.sqrt(a.var() + b.var())
a /= norm
b /= norm

data = Table(np.c_[a, b, c])
print('Original attributes: %d' % len(data.domain.attributes))
pca = PCA(n_components=2)
pca_model = pca(data)
pca_score = pca_model.explained_variance_ratio_
print('Components: %d' % pca_model.n_components)
print('Explained variance ratio: %5.2f' % np.sum(pca_score))

# Plot 3D figures
def plot_figs(fig_num, elev, azim):
    fig = plt.figure(fig_num, figsize=(4, 3))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=elev, azim=azim)
    ax.scatter(a[::10], b[::10], c[::10], marker='o', alpha=.4)

    # Plot the plane spanned by the two principal components
    x_pca_axis, y_pca_axis, z_pca_axis = 3 * pca_model.components_.T
    x_pca_plane = np.r_[x_pca_axis[:2], - x_pca_axis[1::-1]]
    y_pca_plane = np.r_[y_pca_axis[:2], - y_pca_axis[1::-1]]
    z_pca_plane = np.r_[z_pca_axis[:2], - z_pca_axis[1::-1]]
    x_pca_plane.shape = (2, 2)
    y_pca_plane.shape = (2, 2)
    z_pca_plane.shape = (2, 2)
    ax.plot_surface(x_pca_plane, y_pca_plane, z_pca_plane)
    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])


elev = -40
azim = -80
plot_figs(1, elev, azim)

elev = 30
azim = 20
plot_figs(2, elev, azim)

plt.show()


Original attributes: 3
Components: 2
Explained variance ratio:  1.00
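
The two retained components capture essentially all of the variance. Fitting a third component makes the flatness explicit: its explained variance ratio is close to zero, because the third direction carries only the small z noise.


In [ ]:
# With all three components fitted, the last ratio is near zero,
# confirming that the point cloud is (almost) two-dimensional
pca_full_model = PCA(n_components=3)(data)
print('Explained variance ratios: %s' % str(pca_full_model.explained_variance_ratio_))
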

CUR matrix decomposition


In [5]:
# See Clustering notebook
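
CUR itself is covered in the Clustering notebook, but the idea fits in a few lines: approximate a matrix A ≈ C U R, where C holds a subset of actual columns of A, R a subset of actual rows, and U is a small link matrix. The sketch below is illustrative only and does not use the Orange API; it samples columns and rows uniformly at random for brevity, whereas practical CUR implementations usually sample by leverage scores.


In [ ]:
# Minimal CUR decomposition on a random matrix (illustrative sketch)
rng = np.random.RandomState(0)
A = rng.rand(100, 30)
k = 10  # number of sampled columns and rows

col_idx = rng.choice(A.shape[1], size=k, replace=False)
row_idx = rng.choice(A.shape[0], size=k, replace=False)
C = A[:, col_idx]                                    # actual columns of A
R = A[row_idx, :]                                    # actual rows of A
U = np.linalg.pinv(C).dot(A).dot(np.linalg.pinv(R))  # link matrix

rel_err = np.linalg.norm(A - C.dot(U).dot(R)) / np.linalg.norm(A)
print('Relative CUR approximation error: %.3f' % rel_err)
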

Manifold learning


In [6]:
from sklearn import datasets

n_class = 6
digits = datasets.load_digits(n_class=n_class)
print('Description: %s' % digits['DESCR']) 

data = Table(digits.data, Y=digits.target)
print('Original attributes: %d' % len(data.domain.attributes))
print('Samples: %d' % len(data))
unique_labels = set(data.Y)
print('Classes: %s' % ', '.join(map(str, unique_labels)))

# Plot an exemplar image of a handwritten digit
img = data.X[0].reshape((8, 8))

plt.imshow(img, cmap=plt.cm.binary)
plt.xticks([]);
plt.yticks([]);


Description:  Optical Recognition of Handwritten Digits Data Set

Notes
-----
Data Set Characteristics:
    :Number of Instances: 5620
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
http://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

The data set contains images of hand-written digits: 10 classes where
each class refers to a digit.

Preprocessing programs made available by NIST were used to extract
normalized bitmaps of handwritten digits from a preprinted form. From a
total of 43 people, 30 contributed to the training set and different 13
to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of
4x4 and the number of on pixels are counted in each block. This generates
an input matrix of 8x8 where each element is an integer in the range
0..16. This reduces dimensionality and gives invariance to small
distortions.

For info on NIST preprocessing routines, see M. D. Garris, J. L. Blue, G.
T. Candela, D. L. Dimmick, J. Geist, P. J. Grother, S. A. Janet, and C.
L. Wilson, NIST Form-Based Handprint Recognition System, NISTIR 5469,
1994.

References
----------
  - C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their
    Applications to Handwritten Digit Recognition, MSc Thesis, Institute of
    Graduate Studies in Science and Engineering, Bogazici University.
  - E. Alpaydin, C. Kaynak (1998) Cascading Classifiers, Kybernetika.
  - Ken Tang and Ponnuthurai N. Suganthan and Xi Yao and A. Kai Qin.
    Linear dimensionalityreduction using relevance weighted LDA. School of
    Electrical and Electronic Engineering Nanyang Technological University.
    2005.
  - Claudio Gentile. A New Approximate Maximal Margin Classification
    Algorithm. NIPS. 2000.

Original attributes: 64
Samples: 1083
Classes: 0.0, 1.0, 2.0, 3.0, 4.0, 5.0
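
A few more exemplars give a feel for the raw 8x8 inputs before any embedding is computed; the grid below is plotting convenience only, and its size is arbitrary.


In [ ]:
# Show the first ten digits as 8x8 grayscale images
fig, axes = plt.subplots(2, 5, figsize=(5, 2.5))
for ax, x, y in zip(axes.ravel(), data.X, data.Y):
    ax.imshow(x.reshape((8, 8)), cmap=plt.cm.binary)
    ax.set_title(int(y), size=9)
    ax.set_xticks([])
    ax.set_yticks([])
plt.tight_layout()
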

In [7]:
# Baseline: place the samples at random 2D positions, for comparison with
# the embeddings computed below
X_rnd = np.random.rand(len(data), 2)
colors = plt.cm.rainbow(np.linspace(0, 1, n_class))
x_min, x_max = np.min(X_rnd, 0), np.max(X_rnd, 0)
X = (X_rnd - x_min) / (x_max - x_min)

for i in range(X.shape[0]):
    plt.text(X[i, 0], X[i, 1], str(data.Y[i]), color=colors[int(data.Y[i])], fontdict={'weight': 'bold', 'size': 9})

plt.xticks(())
plt.yticks(())
plt.tight_layout()
plt.show()
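
The normalise-then-scatter-text snippet above is repeated for every embedding below. A small helper would do the same job; plot_embedding is only a convenience sketch, not part of Orange, and the cells below keep the explicit code for clarity.


In [ ]:
def plot_embedding(X, labels, title=None):
    """Rescale a 2D embedding to [0, 1] and draw each point as its class label."""
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)
    for i in range(X.shape[0]):
        plt.text(X[i, 0], X[i, 1], str(labels[i]),
                 color=colors[int(labels[i])], fontdict={'weight': 'bold', 'size': 9})
    plt.xticks(())
    plt.yticks(())
    if title is not None:
        plt.title(title)
    plt.tight_layout()
    plt.show()

# e.g. plot_embedding(X_rnd, data.Y, 'Random 2D projection')
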


Multidimensional scaling (MDS)


In [8]:
from Orange.projection.manifold import MDS
from Orange.distance import Euclidean

# MDS finds a low-dimensional representation of the data in which the pairwise
# distances approximate the distances in the original high-dimensional space

mds = MDS(dissimilarity=Euclidean)
mds_model = mds(data)

x_min, x_max = np.min(mds_model.embedding_, 0), np.max(mds_model.embedding_, 0)
X = (mds_model.embedding_ - x_min) / (x_max - x_min)
for i in range(X.shape[0]):
    plt.text(X[i, 0], X[i, 1], str(data.Y[i]), color=colors[int(data.Y[i])], fontdict={'weight': 'bold', 'size': 9})

plt.xticks(())
plt.yticks(())
plt.tight_layout()
plt.show()
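
How well the embedding respects the original distances can be quantified directly: compute all pairwise distances in the 64-dimensional input space and in the 2D embedding, and correlate them. SciPy's pdist is assumed available here (it is a dependency of Orange).


In [ ]:
from scipy.spatial.distance import pdist

d_orig = pdist(data.X)               # pairwise distances in the original space
d_emb = pdist(mds_model.embedding_)  # pairwise distances in the 2D embedding
print('Correlation between original and embedded distances: %.3f'
      % np.corrcoef(d_orig, d_emb)[0, 1])
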


Isomap


In [9]:
from Orange.projection import Isomap

n_neighbors = 30
isomap = Isomap(n_neighbors, n_components=2)
isomap_model = isomap(data)

x_min, x_max = np.min(isomap_model.embedding_, 0), np.max(isomap_model.embedding_, 0)
X = (isomap_model.embedding_ - x_min) / (x_max - x_min)
for i in range(X.shape[0]):
    plt.text(X[i, 0], X[i, 1], str(data.Y[i]), color=colors[int(data.Y[i])], fontdict={'weight': 'bold', 'size': 9})

plt.xticks(())
plt.yticks(())
plt.tight_layout()
plt.show()


Locally linear embedding (LLE)


In [10]:
from Orange.projection import LocallyLinearEmbedding

n_neighbors = 30
lle = LocallyLinearEmbedding(n_neighbors, n_components=2, method='standard')
lle_model = lle(data)

x_min, x_max = np.min(lle_model.embedding_, 0), np.max(lle_model.embedding_, 0)
X = (lle_model.embedding_ - x_min) / (x_max - x_min)
for i in range(X.shape[0]):
    plt.text(X[i, 0], X[i, 1], str(data.Y[i]), color=colors[int(data.Y[i])], fontdict={'weight': 'bold', 'size': 9})

plt.xticks(())
plt.yticks(())
plt.tight_layout()
plt.show()
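
The method argument selects the LLE variant. Besides 'standard', the scikit-learn implementation that Orange wraps also offers 'modified', 'hessian' and 'ltsa', which are often more stable on noisy data; the cell below assumes the wrapper forwards these options and is illustrative only, with no parameter tuning.


In [ ]:
mlle = LocallyLinearEmbedding(n_neighbors, n_components=2, method='modified')
mlle_model = mlle(data)
# mlle_model.embedding_ can be plotted exactly as above, e.g. with the
# plot_embedding helper sketched earlier
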


